import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # 分类树
from sklearn.tree import export_graphviz

# Load the Titanic passenger table; the first CSV column becomes the row index.
data = pd.read_csv("titanic.csv", index_col=0)

# Handy while exploring the raw table:
#   data.info()
#   print(data)

# Model inputs: age (633 non-null values), sex (complete), pclass (complete).
# Work on a copy so the cleaning steps never write back into `data`.
X = data.loc[:, ["age", "sex", "pclass"]].copy()

# Target: 1 = survived, 0 = died.
y = data.loc[:, "survived"]

# --- Data cleaning ---
# Age has missing entries; impute them with the column median
# (the mean and the mode are the other obvious candidates).
age_median = X["age"].median()
X["age"] = X["age"].fillna(age_median)

# --- Encode the non-numeric features as integers ---
# Cabin class: keep only the first character of each label and cast it to int.
X["pclass"] = X["pclass"].str[0].astype(int)

# Sex: 1 for 'male', 0 for every other value.
# To inspect the raw values first: print(X["sex"].unique())
X["sex"] = X["sex"].eq('male').astype(int)

# To inspect the encoded features: print(X)
# Column labels of the feature frame, kept for labelling the tree's nodes.
feature_names = X.columns


# Hold out 20% of the rows for testing. The split is seeded and stratified
# on y so the train and test parts keep similar class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

# Depth-limited classification tree. random_state is fixed because
# scikit-learn permutes the candidate features at every split, so equally
# good splits could otherwise be resolved differently from run to run,
# changing the fitted tree, the reported accuracy and the exported graph.
dt = DecisionTreeClassifier(max_depth=5, random_state=1)
dt.fit(X_train, y_train)

# Mean accuracy on the held-out test set.
print("准确率", dt.score(X_test, y_test))

# Write the fitted tree to a Graphviz dot file.
# Class labels are listed in ascending class order: 0 (died), 1 (survived).
# NOTE(review): the labels are non-ASCII; rendering them presumably needs a
# Graphviz font with CJK glyphs -- confirm on the target machine.
export_graphviz(
    dt,
    out_file="titanic.dot",
    feature_names=feature_names,
    class_names=["死", "生"],
)

# Render the dot file as an image with the Graphviz command-line tool:
#   dot -Tpng titanic.dot -o titanic.png